Implement charset handling in WebRequestConcern

- The `force_encoding` and `unzip` options in WebsiteAgent are moved to
  WebRequestConcern so other users of the concern such as RssAgent can
  benefit from them.

- WebRequestConcern detects a charset specified in the Content-Type
  header to decode the content properly, and if it is missing, the
  content is assumed to be encoded in UTF-8 unless it has a binary MIME
  type. Not all Faraday adapters handle character encodings, and
  Faraday passes through what is returned from the backend, so we need
  to do this on our own. (cf. lostisland/faraday#139)

- WebRequestConcern now converts text contents to UTF-8, so agents can
  handle non-UTF-8 data without having to deal with encodings
  themselves. Previously, WebsiteAgent in "json"/"text" modes and
  RssAgent would suffer from encoding errors when dealing with non-UTF-8
  contents. WebsiteAgent in "html"/"xml" modes did not have this
  problem because Nokogiri would always return results in UTF-8
  independent of the input encoding.

This should fix #608.

Akinori MUSHA 9 years ago
parent
commit
6f667a4973
3 changed files with 65 additions and 21 deletions
  1. 63 1
      app/concerns/web_request_concern.rb
  2. 1 0
      app/models/agents/rss_agent.rb
  3. 1 20
      app/models/agents/website_agent.rb

+ 63 - 1
app/concerns/web_request_concern.rb

@@ -14,6 +14,46 @@ module WebRequestConcern
14 14
     end
15 15
   end
16 16
 
17
+  class CharacterEncoding < Faraday::Middleware
18
+    def initialize(app, force_encoding: nil, default_encoding: nil, unzip: nil)
19
+      super(app)
20
+      @force_encoding   = force_encoding
21
+      @default_encoding = default_encoding
22
+      @unzip            = unzip
23
+    end
24
+
25
+    def call(env)
26
+      @app.call(env).on_complete do |env|
27
+        body = env[:body]
28
+
29
+        case @unzip
30
+        when 'gzip'.freeze
31
+          body.replace(ActiveSupport::Gzip.decompress(body))
32
+        end
33
+
34
+        case
35
+        when @force_encoding
36
+          encoding = @force_encoding
37
+        when body.encoding == Encoding::ASCII_8BIT
38
+          # Not all Faraday adapters support automatic charset
39
+          # detection, so we do that.
40
+          case env[:response_headers][:content_type]
41
+          when /;\s*charset\s*=\s*([^()<>@,;:\\\"\/\[\]?={}\s]+)/i
42
+            encoding = Encoding.find($1) rescue nil
43
+          when /\A\s*(?:text\/[^\s;]+|application\/(?:[^\s;]+\+)?(?:xml|json))\s*(?:;|\z)/i
44
+            encoding = @default_encoding
45
+          else
46
+            # Never try to transcode a binary content
47
+            return
48
+          end
49
+        end
50
+        body.encode!(Encoding::UTF_8, encoding) unless body.encoding == Encoding::UTF_8
51
+      end
52
+    end
53
+  end
54
+
55
+  Faraday::Response.register_middleware character_encoding: CharacterEncoding
56
+
17 57
   extend ActiveSupport::Concern
18 58
 
19 59
   def validate_web_request_options!
@@ -34,6 +74,23 @@ module WebRequestConcern
34 74
     rescue ArgumentError => e
35 75
       errors.add(:base, e.message)
36 76
     end
77
+
78
+    if (encoding = options['force_encoding']).present?
79
+      case encoding
80
+      when String
81
+        begin
82
+          Encoding.find(encoding)
83
+        rescue ArgumentError
84
+          errors.add(:base, "Unknown encoding: #{encoding.inspect}")
85
+        end
86
+      else
87
+        errors.add(:base, "force_encoding must be a string")
88
+      end
89
+    end
90
+  end
91
+
92
+  def default_encoding
93
+    Encoding::UTF_8
37 94
   end
38 95
 
39 96
   def faraday
@@ -44,6 +101,11 @@ module WebRequestConcern
44 101
     }
45 102
 
46 103
     @faraday ||= Faraday.new(faraday_options) { |builder|
104
+      builder.response :character_encoding,
105
+                       force_encoding: interpolated['force_encoding'].presence,
106
+                       default_encoding: default_encoding,
107
+                       unzip: interpolated['unzip'].presence
108
+
47 109
       builder.headers = headers if headers.length > 0
48 110
 
49 111
       builder.headers[:user_agent] = user_agent
@@ -51,7 +113,7 @@ module WebRequestConcern
51 113
       builder.use FaradayMiddleware::FollowRedirects
52 114
       builder.request :url_encoded
53 115
 
54
-      if boolify(options['disable_url_encoding'])
116
+      if boolify(interpolated['disable_url_encoding'])
55 117
         builder.options.params_encoder = DoNotEncoder
56 118
       end
57 119
 

+ 1 - 0
app/models/agents/rss_agent.rb

@@ -29,6 +29,7 @@ module Agents
29 29
           * `basic_auth` - Specify HTTP basic auth parameters: `"username:password"`, or `["username", "password"]`.
30 30
           * `disable_ssl_verification` - Set to `true` to disable ssl verification.
31 31
           * `disable_url_encoding` - Set to `true` to disable url encoding.
32
+          * `force_encoding` - Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid or wrong charset in the Content-Type header.  Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1).
32 33
           * `user_agent` - A custom User-Agent name (default: "Faraday v#{Faraday::VERSION}").
33 34
           * `max_events_per_run` - Limit number of events created (items parsed) per run for feed.
34 35
 

+ 1 - 20
app/models/agents/website_agent.rb

@@ -87,7 +87,7 @@ module Agents
87 87
 
88 88
       Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance).  This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results.
89 89
 
90
-      Set `force_encoding` to an encoding name if the website does not return a Content-Type header with a proper charset.
90
+      Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid or wrong charset in the Content-Type header.  Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1).
91 91
 
92 92
       Set `user_agent` to a custom User-Agent name if the website does not like the default value (`#{default_user_agent}`).
93 93
 
@@ -157,19 +157,6 @@ module Agents
157 157
         errors.add(:base, "Invalid uniqueness_look_back format") unless is_positive_integer?(options['uniqueness_look_back'])
158 158
       end
159 159
 
160
-      if (encoding = options['force_encoding']).present?
161
-        case encoding
162
-        when String
163
-          begin
164
-            Encoding.find(encoding)
165
-          rescue ArgumentError
166
-            errors.add(:base, "Unknown encoding: #{encoding.inspect}")
167
-          end
168
-        else
169
-          errors.add(:base, "force_encoding must be a string")
170
-        end
171
-      end
172
-
173 160
       validate_web_request_options!
174 161
     end
175 162
 
@@ -284,12 +271,6 @@ module Agents
284 271
       interpolation_context.stack {
285 272
         interpolation_context['_response_'] = ResponseDrop.new(response)
286 273
         body = response.body
287
-        if (encoding = interpolated['force_encoding']).present?
288
-          body = body.encode(Encoding::UTF_8, encoding)
289
-        end
290
-        if interpolated['unzip'] == "gzip"
291
-          body = ActiveSupport::Gzip.decompress(body)
292
-        end
293 274
         doc = parse(body)
294 275
 
295 276
         if extract_full_json?